Testing IHS

library(tidyverse)
library(terra)
library(sf)
library(leaflet)
library(scico)
library(flextable)

Map clustered data

To make fuzzy maps of each set of clusters, first import the FCM models generated in the previous step. Also load one of the input rasters, which can be used as a framework to re-spatialise the data. Doesn’t matter which.

model_files <- 
  list.files(file.path('models'), pattern = '\\.rds', full.names = TRUE)

models <- lapply(model_files, readRDS)
names(models) <- paste0('fcm_',
                        sprintf('%02d', 
                                sapply(models, function(i) nrow(i[['centers']]))))

grid <- rast(file.path('data_spatial', 'processed_covariates', 'slope.tif'))

The following function takes the outputs of a model and maps each set of cluster values onto a grid that matches the input covariates. Since the cases are arranged in cell order throughout the analysis, the spatial arrangement of the clusters can be reconstituted.

fuzzy_maps <- function(model = NULL, grid = NULL) {
  # get indexes of cells that contain data
  valcells <- which(!is.na(terra::values(grid)))
  # get number of clusters - 1 cluster, 1 map
  cen <- length(model$size)
  # check that cases == cells or the output maps will be scrambled
  cases  <- nrow(model$membership)
  
  if(length(valcells) != cases) {
    stop("cluster cases doesn't match number of non-NA grid cells - check inputs")
  }
  
  out <- lapply(seq(cen), function(m) {
    grid[valcells] <- model$membership[, m]
    grid
  })
  
  names(out) <- paste0('Cluster_', sprintf('%02d', seq(cen)))
  terra::rast(out)
}

# for example,
clmaps_04 <- fuzzy_maps(models[['fcm_04']], grid)

So despite the poor validity statistics, we see some patterns that are recognisable, e.g. the steep areas appear to be highlighted in Cluster 3.

Harden maps

These maps can be hardened into binary surfaces at a set membership cut-off. A value of 0.6 is used below per the results of the sensitivity testing in (2013, sec. 3.4):

binary_maps <- function(model = NULL, grid = NULL, threshold = 0.6) {
  # reclassify the membership matrix to 0/1 by threshold
  mem_bin <- apply(model$membership, MARGIN = c(1,2), function(i) {
    # don't use NA here, makes rep grade calc harder later
    if(i < threshold) { 0 } else { 1 }
  })
  
  valcells <- which(!is.na(terra::values(grid)))
  cen <- length(model$size)
  cases  <- nrow(mem_bin)
  
  if(length(valcells) != cases) {
    stop("cluster cases doesn't match number of non-NA grid cells - check inputs")
  }
  
  out <- lapply(seq(cen), function(m) {
    grid[valcells] <- mem_bin[, m]
    grid
  })
  
  names(out) <- paste0('Cluster_', sprintf('%02d', seq(cen)))
  terra::rast(out)
}

binmaps_04 <- binary_maps(models[['fcm_04']], grid, 0.6)

Map representative grade

The ‘representative grade’ can now be calculated - the number of times each pixel appears in a cluster (with a sufficiently high membership each time) across multiple c.

using c = 4,5,6,7,8 for now

# already did c = 4, so
clmaps_05 <- fuzzy_maps(models[['fcm_05']], grid)
clmaps_06 <- fuzzy_maps(models[['fcm_06']], grid)
clmaps_07 <- fuzzy_maps(models[['fcm_07']], grid)
clmaps_08 <- fuzzy_maps(models[['fcm_08']], grid)
binmaps_05 <- binary_maps(models[['fcm_05']], grid, 0.6)
binmaps_06 <- binary_maps(models[['fcm_06']], grid, 0.6)
binmaps_07 <- binary_maps(models[['fcm_07']], grid, 0.6)
binmaps_08 <- binary_maps(models[['fcm_08']], grid, 0.6)

Representative grade is a simple overlay-and-sum operation using the hardened cluster maps.

rep_04 <- sum(binmaps_04) # maps whether cells were in any cluster
rep_05 <- sum(binmaps_05)
rep_06 <- sum(binmaps_06)
rep_07 <- sum(binmaps_07)
rep_08 <- sum(binmaps_08)

representative_grade <- sum(c(rep_05, rep_05, rep_06, rep_07, rep_08))

Determine environmental cluster chains

The ‘environmental cluster chains’ for each pixel now need to be determined. This effectively assigns a cluster/iteration signature, e.g. a pixel representative of class 1 in the 3-cluster surface and class 3 in the 5-cluster surface would have a label like ‘1-0-3-0-0’.

Each of these cluster chains will occupy a set number of pixels. Yang et al. (2013) propose that the chains that have both a high representative grade and a large area should be targeted first for sampling, and that pixels with a high average membership within those areas are the best places to go.

Firstly, the binary maps need to be recoded so that they have a cluster number - so e.g. the map for cluster 3 in the c = 4 solution takes values of 3/0 instead of 1/0

renumber_binmaps <- function(binmaps = NULL) {
  nl <- terra::nlyr(binmaps)
  for(i in seq(nl)) { binmaps[[i]][binmaps[[i]] == 1] <- i }
  binmaps
}

binmaps_04_rn <- renumber_binmaps(binmaps_04) 
binmaps_05_rn <- renumber_binmaps(binmaps_05)
binmaps_06_rn <- renumber_binmaps(binmaps_06)
binmaps_07_rn <- renumber_binmaps(binmaps_07)
binmaps_08_rn <- renumber_binmaps(binmaps_08)

Next, stack all of the input clustering layers together, grouped by c using sum().

all_clusters <- rast(list(sum(binmaps_04_rn),
                          sum(binmaps_05_rn),
                          sum(binmaps_06_rn),
                          sum(binmaps_07_rn),
                          sum(binmaps_08_rn)))
names(all_clusters) <- paste0('fcm_', sprintf('%02d', 4:8))

This allows us to extract all of the ECC chain data to a table with

get_clusterchains <- function(clmap = NULL) {
  as.data.frame(clmap)  %>% 
    # create ecc name and calc rep grade and a UID
    mutate(ecc_name = apply(., MARGIN = 1, function(i) {
    if(all(is.na(i))) { return(NA_character_) }
    paste0(i, collapse = '-') 
    }),
    rep_grade = rowSums(. != 0)) %>% 
    # calc rep area
    group_by(across(everything())) %>%
    summarise(rep_area = n()) %>% 
    ungroup() %>% 
    mutate(across(-ecc_name, na_if, 0)) %>% 
    filter(rep_grade > 0) %>% 
    arrange(desc(rep_grade), desc(rep_area)) %>% 
    # will need later...
    mutate(uid = row_number()) 
}

all_eccs <- get_clusterchains(all_clusters)

And map out the location of each chain.

# this needs some work
map_clusterchains <- function(clmap = NULL, eccs = NULL, 
                              id_field = NULL, grid = NULL) {
  dat <- as.data.frame(clmap) %>% 
    mutate(ecc_name = apply(., MARGIN = 1, function(i) {
    if(all(is.na(i))) { return(NA_character_) }
    paste0(i, collapse = '-') 
    })) %>% 
    left_join(., eccs[, c('ecc_name', id_field)], by = 'ecc_name')
  
  grid[which(!is.na(values(grid)))] <- dat[[id_field]]
  grid
}

ecc_map <- map_clusterchains(all_clusters, all_eccs, 'uid', grid)

Next, tag the chains Yang et al. (2013) regard as redundant, in that they are ‘contained’ in a chain with a higher representative grade (e.g. 0-2-0-0-8 would be redundant to sample if 2-2-2-2-8 was already targeted).

# https://stackoverflow.com/questions/63806865/flagging-redundant-rows-with-na/
# if its stupid and it works, its not stupid >.>
tag_redundant <- function(dat = NULL, n_c = NULL) {

    na_count <- rowSums(is.na(dat[, 1:n_c]))

    strings <- apply(dat[, 1:n_c], MARGIN = 1, function(row) {
      row[is.na(row)] <- '.'
      paste0(row, collapse ='')
    })
    
    dat$is_unique <- 
    sapply(seq_along(strings), function(i) {
      if(na_count[i] == 0) { return(TRUE) } # no redundancy where rg == max rg
      test_targets <- strings[na_count <= na_count[i]]
      test_targets <- test_targets[!test_targets %in% strings[i]]
      !any(grepl(strings[i], test_targets))
    })

    dat
}

all_eccs <- tag_redundant(all_eccs, 5)

ECC data, top ten cluster chains

ecc_name

rep_grade

rep_area

uid

is_unique

2-2-2-2-8

5

1,107

1

TRUE

3-3-6-6-6

5

275

2

TRUE

1-1-5-1-1

5

134

3

TRUE

1-5-5-1-5

5

113

4

TRUE

2-1-1-7-2

5

80

5

TRUE

1-1-1-7-1

5

37

6

TRUE

4-4-3-5-4

5

10

7

TRUE

1-1-1-7-2

5

1

8

TRUE

2-0-1-7-2

4

382

9

FALSE

0-1-1-7-2

4

340

10

FALSE

This signifies that 21 ECCs are unique. Of those, only 9 have a representative area of > 20 cells (in this case, that’s 0.2 ha). These are the areas in which sample points should be chosen. This is the next step, which requires the following data.

if(!dir.exists(file.path('sample_plan'))) {
  dir.create(file.path('sample_plan'))
}

if(!dir.exists(file.path('data_spatial', 'model_outputs'))) {
  dir.create(file.path('data_spatial', 'model_outputs'))
}

write_csv(all_eccs, file.path('sample_plan', 'ECC_table.csv'))

writeRaster(ecc_map, 
            file.path('data_spatial', 'model_outputs', 'ecc_map.tif'),
            overwrite = TRUE,
            datatype = 'FLT4S',
            gdal = "COMPRESS=LZW")

writeRaster(all_clusters,
            file.path('data_spatial', 'model_outputs', 'all_clusters.tif'),
            overwrite = TRUE,
            datatype = 'FLT4S',
            gdal = "COMPRESS=LZW")

# need all the membership data for picking sites too,
purrr::map2(
  list(clmaps_04, clmaps_05, clmaps_06, clmaps_07, clmaps_08), 4:8, 
  function(i, j) {
    outnm <- paste0('cluster_', sprintf('%02d', j), '_membership.tif')
    writeRaster(i,
                file.path('data_spatial', 'model_outputs', outnm),
                overwrite = TRUE,
                datatype = 'FLT4S',
                gdal = "COMPRESS=LZW")
    })

References

Yang, L., A. X. Zhu, F. Qi, C. Z. Qin, B. Li, and T. Pei. 2013. “An Integrative Hierarchical Stepwise Sampling Strategy for Spatial Sampling and Its Application in Digital Soil Mapping.” International Journal of Geographical Information Science 27 (1): 1–23. https://doi.org/10.1080/13658816.2012.658053.